/*==============================================================================
FILE NAME: Explore_Investigations.do
INPUTS: incidents_clean.dta, Notice_of_Violation_Clean.dta, Enforcements_Clean.dta,
OUTPUTS: investigations_with_noe_nov.dta
CREATED: 16 September 2021
UPDATED: 16 June 2022
==============================================================================*/

/* Set directory if working independently through code */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 

// Identify investigations linked to non-referred complaint incidents
use "$processed_data/incidents_clean.dta", clear
drop if IncidentStatus=="REFERRED"
keep if IN!=.
gen complaint_inv=1
label var complaint_inv "=1 if investigation is linked to complaint incident"
keep IN complaint_inv
duplicates drop
// IN uniquely identifies obs at this point
sort IN
save "$processed_data/temp_CIN.dta", replace
// This file identifies investigations that are linked to non-referred complaint incidents in the incidents data file

// Identify investigations present in investigations data
use "$processed_data/investigations_clean.dta", clear
destring IN, replace
keep IN
duplicates drop
sort IN
merge 1:1 IN using "$processed_data/temp_CIN.dta"
keep if _merge==2
drop _merge
gen nolink=1
label var nolink "=1 to identify a complaint-linked investigation not found in investigations file"
save "$processed_data/nolink.dta", replace
// This file identifies investigations that are linked to air complaint incidents but are not found in the investigations data file

// Identify investigations that resulted in NOV
use "$processed_data/Notice_of_Violation_Clean.dta", clear 
destring IN, replace
keep IN violation
duplicates drop
label var violation "=1 if investigation resulted in NOV"
sort IN
save "$processed_data/temp_NOV.dta", replace
// This file identifies investigations that show up in the NOV data file

// Identify investigations that resulted in NOE
use "$processed_data/Enforcements_Clean.dta", clear 
destring IN, replace
keep IN enforcement
duplicates drop
label var enforcement "=1 if investigation resulted in NOE"
sort IN
save "$processed_data/temp_NOE.dta", replace
// This file identifies investigations that show up in the NOE data file

// Identify all investigations in the investigations data file
use "$processed_data/investigations_clean.dta"
destring IN, replace
keep IN
duplicates drop
sort IN
save "$processed_data/temp_IN.dta", replace

// Start with all investigations and merge with NOV and NOE files to identify investigations that resulted in NOV, NOE, or both
clear
use "$processed_data/temp_IN.dta"
merge 1:1 IN using "$processed_data/temp_NOV.dta"
drop if _merge==2
drop _merge
// Only 3 observations from NOV file fail to merge with IN file
sort IN
merge 1:1 IN using "$processed_data/temp_NOE.dta"
// Only 5 observations from NOE file fail to merge with IN file
drop if _merge==2
drop _merge
replace violation=0 if violation==.
replace enforcement=0 if enforcement==.
sort IN
save "$processed_data/IN_NOV_NOE.dta", replace

// Distinguish between complaint investigations and non-complaint investigations
use "$processed_data/IN_NOV_NOE.dta", clear
merge 1:1 IN using "$processed_data/temp_CIN.dta"
replace complaint_inv=0 if _merge==1
// Remove nolink complaint investigations as these will be appended later
drop if _merge==2
drop _merge
gen nov_or_noe=0
replace nov_or_noe=1 if violation==1 | enforcement==1
label var nov_or_noe "=1 if investigation led to NOV or NOE"
sort IN
save "$processed_data/IN_NOV_NOE.dta", replace

// Link back to investigations data to get year of investigation
use "$processed_data/investigations_clean.dta", clear
destring IN, replace
label var year "investigation year"
sort IN
merge m:1 IN using "$processed_data/IN_NOV_NOE.dta"
drop _merge
keep IN complaint_inv violation enforcement nov_or_noe year
duplicates drop
save "$processed_data/IN_NOV_NOE.dta", replace
// This file will be appended to nolink complaint investigations

// Back to nolink complaint investigations
// We need to merge the nolink complaint investigations to the NOV and NOE data files by VN (not IN)
// So first create NOV and NOE files identified by VN
use "$processed_data/Notice_of_Violation_Clean.dta", clear
keep VN violation
duplicates drop
label var violation "=1 if investigation resulted in NOV"
sort VN
save "$processed_data/temp_NOV.dta", replace

use "$processed_data/Enforcements_Clean.dta", clear
keep VN enforcement
duplicates drop
label var enforcement "=1 if investigation resulted in NOE"
sort VN
save "$processed_data/temp_NOE.dta", replace

use "$processed_data/nolink.dta", clear
sort IN
merge 1:m IN using "$processed_data/incidents_clean.dta"
keep if _merge==3
drop _merge
sort VN
merge m:1 VN using "$processed_data/temp_NOV.dta"
drop if _merge==2
replace violation=0 if _merge==1
rename _merge merge_NOV
sort VN
merge m:1 VN using "$processed_data/temp_NOE.dta"
drop if _merge==2
replace enforcement=0 if _merge==1
rename _merge merge_NOE
gen nov_or_noe=0
replace nov_or_noe=1 if violation==1 | enforcement==1
tab merge_NOV merge_NOE
tab year merge_NOV
tab year merge_NOE
// Only 18 \"no link\" investigations link to an NOV, all in 2020
// Only 14 \"no link\" investigations link to an NOE, all in 2003
keep IN complaint_inv nolink violation enforcement nov_or_noe InvestigationStartDate
gen year=year(InvestigationStartDate)
drop InvestigationStartDate
duplicates drop
sort IN year
collapse (sum) violation enforcement nov_or_noe (max) nolink complaint_inv, by(IN year)
// Append to linked investigations
append using "$processed_data/IN_NOV_NOE.dta"
replace nolink=0 if nolink==.
label var year "investigation year"
label var violation "=1 if investigation resulted in NOV"
label var enforcement "=1 if investigation resulted in NOE"
label var nov_or_noe "=1 if investigation resulted in NOV or NOE"
sort IN year
save "$processed_data/investigations_with_noe_nov.dta", replace

// Clean up temporary files
erase "$processed_data/temp_IN.dta"
erase "$processed_data/temp_CIN.dta"
erase "$processed_data/nolink.dta"
erase "$processed_data/temp_NOV.dta"
erase "$processed_data/IN_NOV_NOE.dta"
erase "$processed_data/temp_NOE.dta"